package crawler;

import cn.edu.hfut.dmic.webcollector.crawler.DeepCrawler;
import cn.edu.hfut.dmic.webcollector.crawler.OnCrawlerVisitListener;
import cn.edu.hfut.dmic.webcollector.model.Links;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.net.HttpRequester;
import cn.edu.hfut.dmic.webcollector.net.HttpRequesterImpl;
import crawler.classes.Article;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.*;

/**
 * Crawler for Sogou news search results.
 *
 * <p>Seeds the crawl with the search URL for a keyword, extracts every result
 * entry of a visited result page into an {@link Article}, hands the batch to
 * {@code ArticlesService.saveMore} and follows the pagination links found in the
 * {@code pagebar_container} element.
 *
 * <p>NOTE(review): {@link #current_urls} and {@link #current_articles} are plain
 * {@code ArrayList}s while {@code main} configures 4 threads; if the framework
 * invokes {@link #visitAndGetNextLinks(Page)} concurrently these lists are not
 * thread-safe — confirm against the DeepCrawler implementation.
 */
public class CrawlerSouGou extends DeepCrawler {
	/** Matches the {@code yyyy-MM-dd HH:mm} timestamp inside the ".news-from" text. */
	private static final Pattern CREATE_TIME_PATTERN =
			Pattern.compile("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}");

	/** Pause, in milliseconds, applied after each visited page. */
	public int wait_time = 1000;

	@Override
	public HttpRequester getHttpRequester() {
		return super.getHttpRequester();
	}

	/** Source tag stored on every {@link Article} produced by this crawler. */
	public String type = "sougou";
	/** Search keyword this crawler was created for. */
	public String key_word;
	/** Regex a pager link's {@code href} must match to be followed. */
	public String next_link_regex=".*page=\\d+.*";

	/** Result-page URLs already scheduled, used to avoid scheduling them twice. */
	public ArrayList<String> current_urls = new ArrayList<String>();
	/** Article URLs already collected, used to avoid collecting them twice. */
	public ArrayList<String> current_articles = new ArrayList<String>();
	/** Passed through to {@code ArticlesService.saveMore}; also enables the onUpdate callback. */
	public boolean isUpdate;

	/**
	 * Creates the crawler, records the keyword via {@code ArticlesService.SaveKeyWord},
	 * adds the search seed URL and configures browser-like request headers.
	 *
	 * @param crawlPath path handed to the {@code DeepCrawler} constructor
	 * @param key_word  search keyword
	 * @param isUpdate  forwarded to {@code ArticlesService.saveMore}
	 */
	public CrawlerSouGou(String crawlPath, String key_word, boolean isUpdate) {
		super(crawlPath);
		this.isUpdate = isUpdate;
		ArticlesService.SaveKeyWord(key_word);
		this.key_word = key_word;
		// NOTE(review): URLEncoder.encode(String) is deprecated and uses the platform
		// default charset; kept as-is because the charset the site expects is not
		// visible from this file.
		this.addSeed("http://news.sogou.com/news?query=" + URLEncoder.encode(key_word) );
		// Too few results are crawled otherwise, so set request headers on the HTTP requester.
		HttpRequesterImpl httpRequester1= (HttpRequesterImpl) this.getHttpRequester();
		httpRequester1.setHeader("User-agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36");
		httpRequester1.setHeader("Host", "news.sogou.com");
		httpRequester1.setHeader("Accept-Language", "zh-CN,zh;q=0.8");
		httpRequester1.setHeader("Accept-Encoding", "gzip");
	}

	/**
	 * Extracts the articles of the page, sleeps for {@link #wait_time} milliseconds
	 * and returns the not-yet-seen pagination links.
	 *
	 * @param page the fetched result page
	 * @return links to follow next (possibly empty)
	 */
	@Override
	public Links visitAndGetNextLinks(Page page) {
		extractArticles(page);
		try {
			System.out.println("暂停时间:" + wait_time / 1000 + " 秒");
			Thread.sleep(this.wait_time);
		} catch (InterruptedException e) {
			e.printStackTrace();
			// Restore the interrupt flag so code further up the stack can observe it.
			Thread.currentThread().interrupt();
		}
		return getPageLinks(page);
	}

	/**
	 * Collects pager links whose {@code href} matches {@link #next_link_regex} and
	 * which have not been scheduled before.
	 *
	 * @param page the fetched result page
	 * @return new links; empty when the page has no parsed document or no pager element
	 */
	private Links getPageLinks(Page page) {
		Links ls = new Links();
		Document doc = page.getDoc();
		// getElementById returns null when the element is absent (e.g. a result page
		// without a pager); treat that as "no links" instead of failing with an NPE.
		Element pageBar = doc == null ? null : doc.getElementById("pagebar_container");
		Elements elements = pageBar == null ? new Elements() : pageBar.select("a");
		System.out.println(page.getUrl()+"\n"+elements.size());
		for (Element a : elements) {
			String href = a.attr("href");
			if (!href.matches(this.next_link_regex)) {
				continue;
			}
			String url = "http://news.sogou.com/news" + href;
			if (!this.current_urls.contains(url)) {
				ls.add(url);
				System.out.println("添加下次采集链接:" + href);
				this.current_urls.add(url);
			}
		}
		return ls;
	}

	/**
	 * Builds an {@link Article} for every ".vrwrap" entry of the page that has a
	 * ".vrTitle" and whose URL has not been collected yet, downloads the linked
	 * document for its full text/HTML (best effort), saves the batch through
	 * {@code ArticlesService.saveMore} and notifies the visit listener.
	 *
	 * @param page the fetched result page
	 */
	public void extractArticles(Page page) {
		Document doc = page.getDoc();
		if (doc == null) {
			// Defensive: nothing to extract without a parsed document.
			return;
		}
		List<Article> articleList = new ArrayList<Article>();

		for (Element element : doc.select(".vrwrap")) {
			Elements titles = element.select(".vrTitle");
			// Skip recommended news, which carry no ".vrTitle".
			if (titles.isEmpty()) {
				continue;
			}
			Element titleElement = titles.get(0);
			Elements summary = element.select(".news-txt");

			String url = titleElement.getElementsByTag("a").attr("href");
			String title = titleElement.text();
			String content = summary.text();
			String src_html = summary.html();
			String create_time = extractCreateTime(element.select(".news-from").text());

			if (url == null || title == null || content == null || src_html == null
					|| create_time == null || this.current_articles.contains(url)) {
				continue;
			}
			System.out.println(page.getUrl());
			System.out.println(url);

			// Reset per article so a failed download never reuses the previous
			// article's full text/HTML.
			String full_content = "";
			String full_html = "";
			try {
				Document fulldoc = Jsoup.connect(url).get();
				full_content = fulldoc.text();
				full_html = fulldoc.html();
			} catch (Exception e) {
				// Best effort: keep the article with empty full text when the fetch fails.
				e.printStackTrace();
			}

			Article article = new Article(this.type, this.key_word, url, title, content, src_html, create_time,full_content,full_html);
			articleList.add(article);
			current_articles.add(url);
			if (onCrawlerVisitListener != null) {
				onCrawlerVisitListener.onVisit(url);
			}
			this.totalSize=current_articles.size();
		}

		int updateSize = ArticlesService.saveMore(articleList,isUpdate);
		if (isUpdate && onCrawlerVisitListener != null) {
			onCrawlerVisitListener.onUpdate(updateSize);
		}
		System.out.println("当前采集完成量:"+current_articles.size());
	}

	/**
	 * Returns the first {@code yyyy-MM-dd HH:mm} timestamp found in the text, or the
	 * text unchanged when it contains none.
	 */
	private static String extractCreateTime(String text) {
		Matcher matcher = CREATE_TIME_PATTERN.matcher(text);
		return matcher.find() ? matcher.group() : text;
	}

	/** Manual entry point: crawls a sample keyword with 4 threads and no-op listener callbacks. */
	public static void main(String[] args) throws Exception {
		CrawlerSouGou crawler = new CrawlerSouGou("./tmp", "张宇",false);
		crawler.setThreads(4);
		crawler.setOnCrawlerVisitListener(new OnCrawlerVisitListener() {

			@Override
			public int onVisit(String url) {
				return 0;
			}
			@Override
			public int onComplete(int count) {
				return 0;
			}
			@Override
			public int onUpdate(int updateSize) {
				// TODO Auto-generated method stub
				return 0;
			}
		});
		crawler.start(20);
	}


}
